\documentclass[UTF8,a4paper]{memoir}

\usepackage{pgf-pie}

\usepackage{varioref}

% urls
\usepackage[hyphens]{url}

\usepackage{fancyvrb}


% chinese
\usepackage{CJKutf8}

% use of acronyms
\usepackage[printonlyused,withpage]{acronym}

% use of multirow
\usepackage{multirow}

% nice fonts for the math sets
\usepackage{amsfonts}

% languages
\usepackage[francais,english]{babel}
\usepackage[T1]{fontenc}

% hyphen rules
% Hyphenation exceptions registered while the French patterns are active,
% so they only apply to text typeset with French hyphenation rules.
\begin{hyphenrules}{francais}
  \hyphenation{%
    pr\'e-f\'e-ren-ces
    fra-me-work
    questions-r\'eponses
  }
\end{hyphenrules}

%pgf plotting
\usepackage{pgfplots}
\usepackage{graphicx}
\usepackage[hang,small,bf]{caption}
\usepackage{subfig}

\usepackage{lscape}
\usepackage{longtable}
\usepackage{rotating}

\usepackage{abstract}

% Chinese
\usepackage{CJK}
% \zht{<text>}: typeset <text> inside a starred CJK* environment
% (UTF8 encoding, gbsn font family).
\newcommand{\zht}[1]{%
  \begin{CJK*}{UTF8}{gbsn}%
    #1%
  \end{CJK*}%
}
% \zhs{<text>}: same encoding and family, but the unstarred CJK environment.
% NOTE(review): both macros select gbsn; they differ only in CJK* vs CJK.
\newcommand{\zhs}[1]{%
  \begin{CJK}{UTF8}{gbsn}%
    #1%
  \end{CJK}%
}

\usepackage{qtree}
\usepackage{tikz}
\usetikzlibrary{fit, arrows, decorations.markings,positioning,trees,shapes}

% use of listings
\usepackage{listings}
\usepackage{framed}
\usepackage{MnSymbol}
% Global defaults applied to every listing in the document.
\lstset{%
  % --- language: SQL plus a few extra highlighted keywords.
  % Keep language=... ahead of morekeywords=... so the additions are applied
  % on top of the selected language definition.
  language=SQL,
  morekeywords={PREFIX,java,rdf,rdfs,url,xsd,CONSTRUCT},
  % --- typography: 8pt typewriter on an 11pt baseline
  basicstyle=\ttfamily\fontsize{8}{11}\selectfont,
  % --- line numbers in the left margin, in \tiny
  numbers=left,
  numberstyle=\tiny,
  % --- stretchable vertical space around each listing
  aboveskip=6pt plus 2pt,
  belowskip=2pt plus 8pt,
}


% use of minitoc
\usepackage{shorttoc,titletoc}

% Heading text shown above the partial table of contents.
\newcommand\partialtocname{Outline}
% Full-width horizontal rule (1.3pt thick, raised by 5pt) used above and below
% the partial table of contents.
\newcommand\ToCrule{\noindent\rule[5pt]{\textwidth}{1.3pt}}
% Title block: the heading in large bold type, a 2pt skip, then the rule.
\newcommand\ToCtitle{{\large\bfseries\partialtocname}\vskip2pt\ToCrule}
% \Mprintcontents: prints the title block, a contents list, and a closing rule.
% \ttl@printlist is an internal (@-named) command, hence \makeatletter.
% NOTE(review): presumably provided by titletoc; its argument list is not a
% documented interface and may differ between package versions -- confirm
% against the installed version before upgrading.
\makeatletter
\newcommand\Mprintcontents{%
  \ToCtitle
  \ttl@printlist[chapters]{toc}{}{1}{}\par\nobreak
  \ToCrule}
\makeatother


% \TODO{<note>}: draft marker. Prints an underlined "TODO", a colon and <note>,
% all in red, and ends the current line with \\.
\newcommand\TODO[1]{%
  {\textcolor{red}{\underline{TODO}: #1\\}}%
}

% Hyphenation exception for "specific", allowing breaks as spe-ci-fic.
% It is registered for whichever language is active at this point of the preamble.
\hyphenation{spe-ci-fic}

% \Chapter{<title>}: starts a chapter with the given title, then sets the
% figure counter to 1.
% NOTE(review): with the counter at 1, the next figure is stepped to 2 --
% confirm this offset is intended rather than a reset to 0.
\newcommand{\Chapter}[1]{%
  \chapter{#1}%
  \setcounter{figure}{1}%
}


% \circled{<content>}: <content> inside a drawn circle, as an inline TikZ
% picture aligned on the text baseline, scaled to 80% and wrapped in \smash
% so that it does not change the height of the surrounding line.
\newcommand*\circled[1]{%
  \smash{%
    \scalebox{0.8}{%
      \tikz[baseline=(char.base)]{%
        \node[shape=circle, draw, inner sep=2pt] (char) {#1};%
      }%
    }%
  }%
}

% Inline colored circle markers. Each macro typesets its argument inside a
% circular TikZ node (black outline, minimum size 10pt), aligned on the text
% baseline with a small per-macro yshift, and wrapped in \smash so the marker
% does not change the height of the surrounding line.
% Node names (meas, meas1Label, yellowNode, proj1) are kept as they were.

% \lightrednode{<label>}: pale red fill (red!10), \tiny label, scaled to 80%.
\newcommand*\lightrednode[1]{%
  \smash{\scalebox{0.8}{%
    \tikz[baseline={([yshift=-1pt]meas.base)}]{%
      \node[circle, minimum size=10pt, draw=black, font=\tiny,
            fill=red!10, node distance=2.5cm] (meas) {#1};%
    }%
  }}%
}

% \darkrednode{<label>}: full red fill (red!100), \tiny label.
% NOTE(review): unlike the other markers this one is not wrapped in
% \scalebox{0.8} -- confirm the size difference is intended.
\newcommand*\darkrednode[1]{%
  \smash{%
    \tikz[baseline={([yshift=-2pt]meas1Label.base)}]{%
      \node[circle, minimum size=10pt, draw=black, font=\tiny,
            node distance=.8cm, fill=red!100] (meas1Label) {#1};%
    }%
  }%
}

% \yellownode{<label>}: yellow fill (yellow!30), \small label, scaled to 80%.
% Only one font key is given: when the same key appears twice in an option
% list the later value wins, so an additional font=\tiny before font=\small
% would have no effect on the output.
\newcommand*\yellownode[1]{%
  \smash{\scalebox{0.8}{%
    \tikz[baseline={([yshift=0.5pt]yellowNode.base)}]{%
      \node[circle, minimum size=10pt, draw=black,
            fill=yellow!30, font=\small] (yellowNode) {#1};%
    }%
  }}%
}

% \lightyellownode{<label>}: pale yellow fill (yellow!10), \tiny label,
% scaled to 80%.
\newcommand*\lightyellownode[1]{%
  \smash{\scalebox{0.8}{%
    \tikz[baseline={([yshift=-1pt]proj1.base)}]{%
      \node[circle, minimum size=10pt, draw=black, font=\tiny,
            fill=yellow!10] (proj1) {#1};%
    }%
  }}%
}

\begin{document}
\selectlanguage{english}

% Title page. Everything is laid out by hand: each line is ended with \\ and
% the vertical gaps between lines are fixed \vspace* amounts.
% Size switches written outside braces (\Huge, \large, \LARGE, \Large) stay in
% effect until the next switch or the end of the enclosing environment.
\begin{titlingpage}
% --- Institution, degree, author, title, supervision and defence date (centered)
\begin{center}
\noindent {\large \textbf{\'ECOLE CENTRALE PARIS}} \\
\vspace*{0.3cm}
\noindent {\LARGE \textbf{\'ECOLE DOCTORALE 287}} \\
\noindent \textbf{SCIENCES POUR L'ING\'ENIEUR} \\
\vspace*{0.5cm}
\noindent \Huge \textbf{T H \`E S E} \\
\vspace*{0.3cm}
\noindent \large {pour obtenir le titre de} \\
\vspace*{0.1cm}
\noindent \LARGE \textbf{Docteur en Sciences} \\
\vspace*{0.3cm}
\noindent \Large de l'\'Ecole Centrale Paris \\
\vspace*{0.3cm}
\noindent \Large \textbf{Mention : \textsc{Informatique}}\\
\vspace*{0.4cm}
\noindent \large {Pr\'esent\'ee et soutenue par\\}
\noindent \LARGE Nicolas \textsc{Kuchmann-Beauger} \\
\vspace*{0.8cm}
\noindent {\Huge \textbf{Question Answering System\\
in a Business Intelligence Context}}
\\
\vspace*{0.8cm}
\noindent \Large Th\`ese dirig\'ee par Marie-Aude \textsc{Aufaure} \\
\vspace*{0.2cm}
\noindent \Large pr\'epar\'ee \`a SAP BusinessObjects / SAP Research
\\
\noindent \Large et au Laboratoire MAS (EA 4037)\\
\vspace*{0.2cm}
\noindent \large soutenue le 15 f\'evrier 2013 \\
\vspace*{0.5cm}
\end{center}
% --- Jury heading (left-aligned) followed by the centered jury table
\noindent \large \textbf{Jury :} \\
\begin{center}
\noindent \large
% Columns: role | name | dash separator | affiliation
\begin{tabular}{llcl}
      \textit{Rapporteurs :}	& Matthieu \textsc{Roche}		& - & Universit\'e Montpellier 2\\
				& Juan Carlos \textsc{Trujillo Mond\'ejar}		& - & Universidad de Alicante\\
      \textit{Directeur :}	& Marie-Aude \textsc{Aufaure}		& - & \'Ecole Centrale Paris
\\
      \textit{Examinateurs :}   & Patrick \textsc{Marcel}          & - &
Universit\'e de Tours \\
      				& Yannick \textsc{Cras}
& - & SAP Research\\
      				& \'Elisabeth \textsc{M\'etais}
& - & CNAM Paris\\
      \textit{Invit\'e :}		& Philippe \textsc{Meiniel}
& - & SAP France
\end{tabular}
\end{center}
\end{titlingpage}
% Relax line-breaking tolerances; issued at top level, so it applies to
% everything that follows in the document.
\sloppy

%\titlingpage


% Front matter pages are numbered with lowercase roman numerals.
\pagenumbering{roman}

% Number sectional units, and list them in the table of contents, down to level 2.
\setcounter{secnumdepth}{2}
\setcounter{tocdepth}{2}

%\let\origappendix\appendix % save the existing appendix command
%\renewcommand\appendix{\clearpage\pagenumbering{alph}\origappendix}

% Force the page counter to 3 for the first front-matter page.
% NOTE(review): presumably accounts for the title page and its verso -- confirm.
\setcounter{page}{3}
\chapter*{Acronyms}
\addcontentsline{toc}{chapter}{Acronyms}
\begin{acronym}[TDMA]
\acro{AI}{Artificial Intelligence}
\acro{BI}{Business Intelligence}
\acro{CMS}{Content Management System}
\acro{CRM}{Customer Relationship Management}
\acro{DBMS}{Database Management System}
\acro{DSS}{Decision Support Systems}
%\acro{ER}{Entity/Relationship}
\acro{ERP}{Enterprise Resource Planning}
\acro{FOL}{First Order Logic}
\acro{GUI}{Graphical User Interface}
\acro{IDF}{Inverse document frequency}
\acro{IE}{Information Extraction}
\acro{IR}{Information Retrieval}
\acro{MDX}{MultiDimensional eXpression}
\acro{NER}{Named entity recognizer}
\acro{NL}{Natural Language}
\acro{NLP}{Natural Language Processing}
\acro{OLAP}{Online Analytical Processing}
\acro{QA}[Q\&A]{Question Answering}
\acro{RDF}{Resource Description Framework}
\acro{SPARQL}[\textsc{SparQL}]{\textsc{SPARQL} Protocol and RDF Query Language}
\acro{SQL}{Structured Query Language}
\acro{SW}{Semantic Web}
\acro{WWW}[Web]{World Wide Web}
\end{acronym}

% Generated lists: table of contents, figures, tables and listings.
% Each one is preceded by \cleardoublepage.
\cleardoublepage
\tableofcontents
 
\cleardoublepage
\listoffigures
 
\cleardoublepage
\listoftables

% List of listings, provided by the listings package.
\cleardoublepage
\lstlistoflistings


\cleardoublepage
\chapter*{Acknowledgements}
\markboth{Acknowledgements}{Acknowledgements}
This dissertation would not have been possible without the guidance and the
help of several individuals who in one way or another contributed and extended
their valuable assistance in the preparation and completion of this study.

First and foremost, I address my utmost gratitude to Prof.~Marie-Aude Aufaure who kindly accepted to supervise my thesis, and to guide my research.

I want to thank Chahab Nastar and Alexis Naibo who took me aboard at SAP, and Philippe Meiniel who gave me much sound advice and shared his technical knowledge on many subjects.

I would also like to thank my colleagues and staff at SAP Research in Paris and in Dresden on the one hand, and at \'Ecole Centrale Paris in Ch\^atenay-Malabry on the other hand for the good times we spent together. 

Of course, I must be thankful to my dear friends who always supported me in their own way.

Last but not least, I thank my dear parents and family for supporting me throughout my
studies and giving me the strength not to give up, and \zht{陈绵鹏} who has long stayed by my side no matter what.
%and \zhs{永鏗} who makes my life enjoyable.



\cleardoublepage
\markboth{Abstract}{Abstract}
\begin{abstract}
The amount and complexity of data generated by information systems keep
increasing in data warehouses. The domain of Business Intelligence (BI) aims at
providing methods and tools to better help users in retrieving those data. Data
sources are distributed over distinct locations and are usually accessible
through various applications. Looking for new information can therefore be a
tedious task, whereas business users try to reduce their workload. To tackle this
problem, Enterprise Search is a field that has emerged in the last few years,
and that takes into consideration the different corporate data sources as well
as sources available to the public (e.g.\ World Wide Web pages). However,
corporate retrieval systems nowadays still suffer from information overload. We
believe that such systems would benefit from Natural Language (NL) approaches
combined with Q\&A techniques. Indeed, NL interfaces allow users to search new
information in their own terms, and thus obtain precise answers instead of
turning to a plethora of documents. In this way, users do not have to employ
exact keywords or appropriate syntax, and can have faster access to new
information. Major challenges for designing such a system are to interface
different applications and their underlying query languages on the one hand,
and to support users' vocabulary and to be easily configured for new
application domains on the other hand.

This thesis outlines an end-to-end Q\&A framework for corporate use-cases that
can be configured in different settings. In traditional BI systems,
user-preferences are usually not taken into account, nor are their specific
contextual situations. State-of-the-art systems in this field,
\textsc{Soda}\footnote{L.
Blunschi, C. Jossen, D. Kossmann, M. Mori, and K. Stockinger (2011)
Data-thirsty business analysts need SODA: search over data warehouse. CIKM
2011.} and \textsc{Safe}\footnote{G. Orsi, P. Milano, P. Leonardo, L. Tanca, E.
Zimeo, and U. Sannio (2011) Keyword-based, Context-aware Selection of Natural
Language Query Patterns. EDBT 2011.} do not compute search results on the basis
of users' situation. This thesis introduces a more personalized approach, which
better speaks to end-users' situations. Our main experimentation, in this case,
works as a search interface, which displays search results on a dashboard that
usually takes the form of charts, fact tables, and thumbnails of unstructured
documents.
Depending on users' initial queries, recommendations for alternatives are also
displayed, so as to reduce response time of the overall system. This process is
often seen as a kind of prediction model.

Our work contributes to the following: first, an architecture, implemented with
parallel algorithms, that leverages different data sources, namely structured
and unstructured document repositories through an extensible Q\&A framework, and
this framework can be easily configured for distinct corporate settings;
secondly, a constraint-matching-based translation approach, which replaces a
pivot language with a conceptual model and leads to more personalized
multidimensional queries; thirdly, a set of NL patterns for translating BI
questions into structured queries that can be easily configured in specific
settings. In addition, we have implemented an iPhone/iPad\texttrademark{}
application and an HTML front-end that demonstrate the feasibility of the various approaches
developed through a series of evaluation metrics for the core component and
scenario of the Q\&A framework. To this end, we elaborate on a range of
gold-standard queries that can be used as a basis for evaluating retrieval
systems in this area, and show that our system behaves similarly to
the well-known WolframAlpha\texttrademark{} system, depending on the evaluation
settings.
\end{abstract}
\newpage 
\setcounter{footnote}{0}
\selectlanguage{francais}
\markboth{R\'esum\'e}{R\'esum\'e}
\begin{abstract}
Le volume et la complexit\'e des donn\'ees g\'en\'er\'ees par les syst\`emes
d'information croissent de fa\c con singuli\`ere dans les entrep\^ots de
donn\'ees. Le domaine de l'informatique d\'ecisionnelle (aussi appel\'e BI) a
pour objectif d'apporter des m\'ethodes et des outils pour assister les
utilisateurs dans leur t\^ache de recherche d'information. En effet, les sources
de donn\'ees ne sont en g\'en\'eral pas centralis\'ees, et il est souvent
n\'ecessaire d'interagir avec diverses applications. Acc\'eder \`a
l'information est alors une t\^ache ardue, alors que les employ\'es d'une
entreprise cherchent g\'en\'eralement \`a r\'eduire leur charge de travail.
Pour faire face \`a ce constat, le domaine \og Enterprise Search\fg{} s'est
d\'evelopp\'e r\'ecemment, et prend en compte les diff\'erentes sources de
donn\'ees appartenant aussi bien au r\'eseau priv\'e d'entreprise qu'au domaine
public (telles que les pages Internet). Pourtant, les utilisateurs de moteurs
de recherche actuels souffrent toujours du volume trop important
d'information \`a disposition. Nous pensons que de tels syst\`emes pourraient
tirer parti des m\'ethodes du traitement automatique des langues naturelles associ\'ees \`a
celles des syst\`emes de questions/r\'eponses. En effet, les interfaces en
langue naturelle permettent aux utilisateurs de rechercher de l'information en
utilisant leurs propres termes, et d'obtenir des r\'eponses concises et non une
liste de documents dans laquelle l'\'eventuelle bonne r\'eponse doit \^etre
identifi\'ee. De cette fa\c con, les utilisateurs n'ont pas besoin d'employer
une terminologie fig\'ee, ni de formuler des requ\^etes selon une syntaxe tr\`es
pr\'ecise, et peuvent de plus acc\'eder plus rapidement \`a l'information
d\'esir\'ee. Un challenge lors de la construction d'un tel syst\`eme consiste
\`a interagir avec les diff\'erentes applications, et donc avec les langages
utilis\'es par ces applications d'une part, et d'\^etre en mesure de s'adapter
facilement \`a de nouveaux domaines d'application d'autre part.

Notre rapport d\'etaille un syst\`eme de questions/r\'eponses configurable pour
des cas d'utilisation d'entreprise, et le d\'ecrit dans son int\'egralit\'e.
Dans les syst\`emes traditionnels de l'informatique d\'ecisionnelle, les
pr\'ef\'erences utilisateurs ne sont g\'en\'eralement pas prises en compte, ni
d'ailleurs leurs situations ou leur contexte. Les syst\`emes \'etat-de-l'art du
domaine tels que \textsc{Soda}\footnote{L.
Blunschi, C. Jossen, D. Kossmann, M. Mori, and K. Stockinger (2011)
Data-thirsty business analysts need SODA: search over data warehouse. CIKM
2011.} ou \textsc{Safe}\footnote{G. Orsi, P. Milano, P. Leonardo, L. Tanca, E.
Zimeo, and U. Sannio (2011) Keyword-based, Context-aware Selection of Natural
Language Query Patterns. EDBT 2011.} ne g\'en\`erent pas de r\'esultats
calcul\'es \`a partir de l'analyse de la situation des utilisateurs. Ce rapport
introduit une approche plus personnalis\'ee, qui convient mieux aux
utilisateurs finaux. Notre exp\'erimentation principale se traduit par une
interface de type \emph{search} qui affiche les r\'esultats dans un
\emph{dashboard} sous la forme de graphes, de tables de faits ou encore de
miniatures de pages Internet.
En fonction des requ\^etes initiales des utilisateurs, des recommandations de
requ\^etes sont aussi affich\'ees en sus, et ce dans le but de r\'eduire le temps
de r\'eponse global du syst\`eme. En ce sens, ces recommandations sont
comparables \`a des pr\'edictions.

Notre travail se traduit par les contributions suivantes : tout d'abord, une
architecture impl\'ement\'ee via des algorithmes parall\'elis\'es et qui prend en
compte la diversit\'e des sources de donn\'ees, \`a savoir des donn\'ees
structur\'ees ou non structur\'ees dans le cadre d'un \emph{framework} de
questions/r\'eponses qui peut \^etre facilement configur\'e dans des
environnements diff\'erents. De plus, une approche de traduction bas\'ee sur la
r\'esolution de contrainte, qui remplace le traditionnel langage-pivot par un
mod\`ele conceptuel et qui conduit \`a des requ\^etes multidimensionnelles
mieux personnalis\'ees. En outre, un ensemble de patrons linguistiques
utilis\'es pour traduire des questions BI en des requ\^etes pour bases de
donn\'ees, qui peuvent \^etre facilement adapt\'es dans le cas de
configurations diff\'erentes. Enfin, nous avons impl\'ement\'e une application
pour iPhone/iPad\texttrademark{} et une interface de type \og HTML\fg{} qui
d\'emontrent la faisabilit\'e des diff\'erentes approches d\'evelopp\'ees gr\^ace \`a un
ensemble de mesures d'\'evaluations pour l'\'el\'ement principal (le composant
de traduction) et un sc\'enario d'\'evaluation pour le framework dans sa
globalit\'e. Dans ce but, nous introduisons un ensemble de requ\^etes pouvant
servir \`a \'evaluer d'autres syst\`emes de recherche d'information dans le
domaine, et nous montrons que notre syst\`eme se comporte de fa\c con similaire
au syst\`eme de r\'ef\'erence \mbox{WolframAlpha\texttrademark}, en fonction des
param\`etres d'\'evaluation.
\end{abstract}
\selectlanguage{english}


\cleardoublepage
\pagenumbering{arabic}



%\pagenumbering{roman}

% \cleardoublepage


% ----------------------
%  Introduction
% ----------------------
% Each chapter lives in its own file and is pulled in with \include.

\include{chapter_introduction}

% -----------------------
% State of the Art
% -----------------------
\include{chapter_state-of-the-art}

% ----------------------
% Contextual Q&A
% ----------------------
\include{chapter_contextual-qa}



% -----------------------
% Linguistic patterns
% -----------------------
\include{chapter_patterns}

% ----------------------
% Query Modeling
% ----------------------
\include{chapter_query-modeling}



% -----------------------
% Experiments
% -----------------------
\include{chapter_experiments}

% ----------------------
% Discussion
% ----------------------
% NOTE(review): no \include follows this header in this file -- confirm
% whether a discussion chapter is missing or the header is a leftover.

% ----------------------
% Conclusion
% ----------------------
\include{chapter_conclusion}



% -----------------------
% Appendix
% -----------------------
% \appendix switches to appendix mode before the appendix file is included.
\cleardoublepage
% \pagenumbering{roman}
\appendix
\include{appendix}


% Bibliography: BibTeX style "plain", database named "these".
\bibliographystyle{plain}
\bibliography{these}

\end{document}

